This notebook builds a graph between courses, linking them through common themes identified with topic detection (LDA) on their keywords and content summaries.
In [1]:
# Useful starting lines
%matplotlib inline
import os
import numpy as np
import scipy
import scipy.sparse as sp
import matplotlib.pyplot as plt
import pandas as pd
import re
import nltk
import string
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation, PCA, TruncatedSVD, SparsePCA
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from scipy import sparse, stats, spatial
import pickle
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
%load_ext autoreload
%autoreload 2
In [2]:
# Load the courses descriptions dataframe
courses = pd.read_pickle("../data/cleaned_courses_STI.pickle")
courses.head()
Out[2]:
In [3]:
def keywords_to_list(columns, dataframe):
    # Create a keywords dataframe per column and store them in a list
    keywords_df = list()
    # Accumulate the keywords found across all the columns
    keywords = list()
    # Enumerate along the columns: "keywords_EN, keywords_FR ..."
    for c, categ in enumerate(columns):
        # Make a copy
        df = dataframe[[categ]].copy()
        # Join the elements of each list with ";"
        df[categ] = df[categ].apply(lambda x: ";".join(x))
        # Split the dataframe on the ";"
        df = df[categ].str.split(";", expand=True)
        # Format all the strings
        for col in df.columns:
            df[col] = df[col].str.replace(r'^ +', "", regex=True)
            df[col] = df[col].str.replace(r' +$', "", regex=True)
            df[col] = df[col].str.replace(r'/|\\', " ", regex=True)
            df[col] = df[col].str.replace(r'\r', "", regex=True)
            df[col] = df[col].str.replace(r'\(|\)|:', "", regex=True)
            # Add the new keywords
            keywords += list(df[col].unique())
        # Append the cleaned dataframe
        keywords_df.append(df)
    # Get the full, de-duplicated list of keywords
    keywords = sorted(filter(None, list(set(keywords))))
    return keywords, keywords_df
def split_words(x):
    if type(x) is str:
        x = x.lower()
        # Split successively on each separator; after the first pass x becomes a list of words
        for symbol in [";", ",", ".", "\n", " - ", "- ", "_", " "]:
            if type(x) is not str:
                temp = list()
                for word in x:
                    temp += word.split(symbol)
                x = temp
            else:
                x = x.split(symbol)
        return list(filter(None, x))
    else:
        return []
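For example, split_words("Machine Learning; Signal Processing") returns ['machine', 'learning', 'signal', 'processing'].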
In [4]:
keywords_df = list()
list_col = ["KeyWords_EN", "Summary_Concepts_Contents_EN"]
language = "english"
keywords, keywords_df = keywords_to_list(list_col, courses)
print("There are {} distinct keywords in english".format(len(keywords)))
In [5]:
# Concatenate the results of the different columns to obtain one line per course
# join_axes was removed from pd.concat, so align explicitly on the index of the first dataframe
result = pd.concat(keywords_df, axis=1).reindex(keywords_df[0].index)
lda_data = list()
for row in result.values:
    try:
        interm_data = " ".join(list(set(filter(None, row))))
    except TypeError:
        # Fall back to the raw list when the row contains non-string values
        interm_data = list(set(filter(None, row)))
    lda_data.append(interm_data)
print("We have strings for {} courses".format(len(lda_data)))
result.head()
Out[5]:
In [6]:
distinct_words = lda_data.copy()
distinct_words = [split_words(x) for x in lda_data]
distinct_words_per_course = [len(list(set(list(filter(None, x))))) for x in distinct_words]
distinct_words = sum(distinct_words, [])
distinct_words = list(set(list(filter(None, distinct_words))))
med_num_distinct_words = np.median(np.array(distinct_words_per_course))
print("We have {} distinct words which make up the strings".format(len(distinct_words)))
print("Median number of distinct words per course : {}".format(med_num_distinct_words))
Functions used to perform the LDA
In [7]:
def print_top_words(model, feature_names, n_top_words):
    topics_keywords = list()
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
        topics_keywords.append([feature_names[i]
                                for i in topic.argsort()[:-n_top_words - 1:-1]])
    print()
    return topics_keywords

def perform_LDA(data_samples, language, n_samples, n_features, n_components, n_top_words):
    # Extract TF-IDF features and project them with PCA before fitting the LDA
    print("Extracting TF-IDF features for LDA...")
    stop_words = stopwords.words(language)
    tf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                    max_features=n_features,
                                    stop_words=stop_words)
    tfidf = tf_vectorizer.fit_transform(data_samples)
    pca = PCA()
    pca_tfidf = pca.fit_transform(tfidf.toarray())
    print("Fitting LDA model with n_samples=%d and n_features=%d..." % (n_samples, n_features))
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=25,
                                    learning_method='batch',
                                    learning_offset=50.,
                                    random_state=0)
    # Shift the PCA projection so that all entries are non-negative, as LDA requires
    result = lda.fit_transform(pca_tfidf - np.min(pca_tfidf))
    print("\nTopics in LDA model:")
    tf_feature_names = tf_vectorizer.get_feature_names()
    topics_keywords = print_top_words(lda, tf_feature_names, n_top_words)
    return result, lda
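perform_LDA returns the fitted model together with the document-topic matrix (one topic distribution per course); the latter is used further down as the feature vectors from which the course graph is built.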
Performing LDA on the previously formatted keywords
In [8]:
n_samples = len(lda_data)
n_features = len(distinct_words)
n_components = 10
n_top_words = 20
print(n_samples, n_features)
result_lda, lda = perform_LDA(lda_data, language, n_samples, n_features, n_components, n_top_words)
In [9]:
plt.matshow(lda.components_)
plt.colorbar()
Out[9]:
In [10]:
result_lda
Out[10]:
In [11]:
feature_vectors = result_lda.copy()
feature_vectors -= feature_vectors.mean(axis=0)
feature_vectors /= feature_vectors.std(axis=0)
distances = spatial.distance.pdist(feature_vectors, metric = "cosine")
distances = spatial.distance.squareform(distances)
kernel_width = (distances.mean())
weights = np.exp(-np.power(distances, 2)/(kernel_width**2))
weights[np.diag_indices_from(weights)] = 0
plt.matshow(weights);
plt.colorbar();
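The weights follow a Gaussian kernel on the cosine distances, w_ij = exp(-d_ij^2 / sigma^2), with the kernel width sigma set to the mean pairwise distance; the diagonal is zeroed so that no course is linked to itself.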
Saving the weight matrix
In [12]:
# Make sure the output folder exists before writing the pickle
os.makedirs(os.path.join(os.getcwd(), "Graphs"), exist_ok=True)
with open(os.path.join(os.getcwd(), "Graphs", "topics_graph.pkl"), "wb") as pkl_file:
    pickle.dump(weights, pkl_file)
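As a quick sanity check, the saved matrix can be loaded back and turned into a weighted graph. A minimal sketch, assuming networkx is installed (it is not imported in this notebook) and that the index of the courses dataframe provides the node labels:

import networkx as nx

with open(os.path.join(os.getcwd(), "Graphs", "topics_graph.pkl"), "rb") as pkl_file:
    weights_loaded = pickle.load(pkl_file)

# One node per course, edge weights taken from the kernel matrix (the zero diagonal avoids self-loops)
course_graph = nx.from_numpy_array(weights_loaded)
course_graph = nx.relabel_nodes(course_graph, dict(enumerate(courses.index)))
print(course_graph.number_of_nodes(), "nodes,", course_graph.number_of_edges(), "edges")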